# Please ensure the working directory is the same as
# that for replication.R (this file)
setwd("~/Dropbox/Environment (Aseem, Johannes)/Analysis/Replication/")
source("Packages.R")
packages <- c("cluster", "corrplot", "dendextend",
	      "factoextra","foreign", "forcats","fpc", 
	      "ggfortify", "ggthemes", 
	      "grid","gridExtra", "mclust", "NbClust", 
	      "plotly", "pls", "pvclust",
	      "scales", "stargazer", "tidyverse",
	      "tibble", "webshot", "xtable")
package_load(packages)

############################################################
# PART I: PREPARATION/CONSTRUCTION OF DATA
############################################################
# 1) Naming village/module variables and filtering out observations without data
module <- read.dta("ALL_clean_Module.dta")
village <- read.dta("ALL_clean_Village12.dta")
load("relevant_census_data.Rda")

# Group by village and rename relevant variables
village_group <- village %>%  dplyr::rename(village_code = v_q6_village_code, 
		 state = v_q1_state_name, 
		 district = v_q3_district_code,
		 village_pop = village_pop,
		 total_households = total_households) 


# Group by module and rename relevant variables
mod <- module %>% select(hh_id = m1_q3_hhid,
			 hamlet = m1_q12_hamlet,
			 village_code = m1_q11_village_code, 
			 use_grid = m2_q55_grid, 
			 broken = m2_q72_elec_equi_suffer_days, 
			 outage = m2_q71_elec_out_days, 
			 cost = m2_q55_3_grid_spending, 
			 sat = m2_q77_electrified_satisfaction, 
			 day_hours = m2_q69_elec_hrs, 
			 night_hours = m2_q70_elec_night_hrs, 
			 pay_grid = m2_q82_grid_willpay, 
			 grid_interest = m2_q82_grid_interest, 
			 household_id = m1_q3_hhid,
			 state = m1_q8_state,
			 total_adults = m1_q27_no_adults,
			 total_children = m1_q29_no_children,
			 monthly_expenditures = m1_q32_month_expenditure,
			 educ = m1_q23_edu,
			 gov_caste = m1_q25_caste,
			 ngrid_unavailable = m2_q81_grid_availability,
			 ngrid_conexpensive = m2_q811_ngrid_expensive_connect,
			 ngrid_billexpensive = m2_q812_ngrid_expensive_bill,
			 ngrid_unreliable = m2_q813_ngrid_unreliable,
			 ngrid_dk = m2_q814_ngrid_dk,
			 ngrid_other = m2_q815_ngrid_other,
			 lg_lpg_cyl_dist = m4_q103_5_lpg_lcyl_dist,
			 lg_lpg_cyl_mkt = m4_q103_6_lpg_lcyl_mkt,
			 sm_lpg_cyl_dist = m4_q103_8_lpg_scyl_dist,
			 sm_lpg_cyl_mkt = m4_q103_9_lpg_scyl_mkt,
			 pr_lglpg_dist = m4_q103_10_lpg_lcyl_dist_price,
			 pr_smlpg_dist = m4_q103_11_lpg_scyl_dist_price,
			 pr_lglpg_mkt = m4_q103_12_lpg_lcyl_mkt_price,
			 pr_smlpg_mkt = m4_q103_13_lpg_scyl_mkt_price,
			 kerosene_use = m2_q61_kerolamp_use,
			 kerosene_pds = m2_q61_4_kero_liter_PDS,
			 pr_kerosene_pds = m2_q61_5_kero_price_PDS,
			 kerosene_mkt = m2_q61_6_kero_liter_mkt,
			 pr_kerosene_mkt = m2_q61_7_kero_price_mkt,
			 firewood_mkt = m4_q109_3_firewood_mkt,
			 pr_firewood_mkt = m4_q112_firewood_price,
			 dung_purchase = m4_q113_2_dungcake_price,
			 business_activity = m1_q52_business) %>%
mutate(sum_adults_children = ifelse(!is.na(total_adults) & !is.na(total_children), 
				    total_adults + total_children, 
				    ifelse(!is.na(total_adults) & is.na(total_children),
					   total_adults,
					   ifelse(!is.na(total_children) & is.na(total_adults),
						  total_children,
						  NA))),
       scheduled_caste = ifelse(gov_caste == "Scheduled Caste", 1, 0),
       scheduled_tribe = ifelse(gov_caste == "Scheduled Tribe", 1, 0),
       other_bw = ifelse(gov_caste == "Other Backward Class", 1, 0),
       general_caste = ifelse(gov_caste == "General", 1, 0),
       other_caste = ifelse(gov_caste == "Others", 1, 0),
       no_caste = ifelse(gov_caste == "No Caste", 1, 0),
       bw = other_bw + scheduled_caste + scheduled_tribe,
       educ_nf = ifelse(educ == levels(educ)[1], 1, 0),
       educ_5 = ifelse(educ == levels(educ)[2], 1, 0),
       educ_10 = ifelse(educ == levels(educ)[3], 1, 0),
       educ_12 = ifelse(educ == levels(educ)[4], 1, 0),
       educ_grad = ifelse(educ == levels(educ)[5], 1, 0),
       educ_10p = educ_10 + educ_12 + educ_grad,
       ngrid_unavailablen = ifelse(ngrid_unavailable == "Yes", 1, 0),
       ngrid_conexpensiven = ifelse(ngrid_conexpensive == "Yes", 1, 0),
       ngrid_billexpensiven = ifelse(ngrid_billexpensive == "Yes", 1, 0),
       ngrid_unreliablen = ifelse(ngrid_unreliable == "Yes", 1, 0),
       ngrid_dkn =ifelse(ngrid_dk == "Yes", 1, 0),
       ngrid_othern = ifelse(ngrid_other == "" | module$m2_q815_ngrid_other == ".", 0, 1),
       business_activity =ifelse(business_activity == "Yes", 1, 0),
       kerosene_expend = (kerosene_pds*pr_kerosene_pds)+(kerosene_mkt*pr_kerosene_mkt),
       kerosene_expend = ifelse(is.na(kerosene_expend), 0, kerosene_expend),
       total_nrg_expend = (lg_lpg_cyl_dist*pr_lglpg_dist)+
	       (lg_lpg_cyl_mkt*pr_lglpg_mkt)+
	       (sm_lpg_cyl_dist*pr_smlpg_dist)+
	       (sm_lpg_cyl_mkt*pr_smlpg_mkt)+
	       (kerosene_pds*pr_kerosene_pds)+
	       (kerosene_mkt*pr_kerosene_mkt)+
	       (firewood_mkt*pr_firewood_mkt)+
	       dung_purchase)

# Identify villages where sampled that use grid over total sample is between 0 and 1 non-inclusive
to_retain <- mod %>% group_by(village_code) %>% 
	dplyr::summarise(total_use_grid = sum(use_grid == "Yes")) %>% 
	filter(!(total_use_grid == 12 | total_use_grid == 0)) 

# Now filter the module dataset to eliminate village codes with total_use_grid = 0 or 12. Eliminates 2004 modules and 167 villages 6563 households, 547 villages
mod2 <- mod %>% semi_join(to_retain)
vg <- village_group %>% semi_join(to_retain)


# 2) Quality calculations
# For first stage, we want to calculate quality for individuals with electricity
# Here I calculate basic quality variable along with day and night hours to replicate
# Eliminate variables with NA

mod3 <- mod2 %>% filter(use_grid == "Yes" & 
		    day_hours >0 & !is.na(village_code) &
				  !is.na(broken) & 
				  !is.na(outage) & 
				  !is.na(cost)) 

# To calculate quality variable, we take scale hours, outage, broken
# then find Quality = Hrs - Outage - Broken and then rescale it between 0 and 1
# Since we're not comparing hours to quality here, the variable is not rescaled

mod4 <- mod3 %>% mutate(hrs_sc = scale(day_hours)[,1],
				      outage_sc = scale(outage)[,1],
				      broken_sc = scale(broken)[,1],
				      hrs_sc_night = scale(night_hours)[,1],
				      quality = hrs_sc - outage_sc - broken_sc,
				      quality_night = hrs_sc_night - outage_sc - broken_sc,
				      quality_bin = scales::rescale(quality, c(0,1)),
				      quality_bin_n = scales::rescale(quality_night, c(0,1))) %>%
select(-quality_night, -quality, -broken, -outage, -outage_sc, -broken_sc, hrs_sc, hrs_sc_night) %>%
select(village_code, use_grid, cost, sat, day_hours, night_hours, quality_bin, quality_bin_n, household_id)

# Then aggregate mp2 to the village level
# Note that 9 villages fell out (547 to 538) because not enough info to calculate quality 

quality_variables <- mod4 %>% group_by(village_code) %>% 
	dplyr::summarise(EV_hrs_d = mean(day_hours, na.rm = TRUE),
		  EV_hrs_n = mean(night_hours, na.rm = TRUE),
		  EV_qual_d = mean(quality_bin, na.rim = TRUE),
		  EV_qual_n = mean(quality_bin_n, na.rm = TRUE),
		  CTRL_sat = mean(sat, na.rm = TRUE),
		  CTRL_cost_elec = mean(cost, na.rm = TRUE))

#3) Calculate village-specific control variables 

# Now, using the entire module dataset, we want to calculate
# village-specific control and intrumental variables
# using the module and the village datasets

# Using Modules
mod_summ <- mod %>% group_by(village_code, state) %>% 
	dplyr::summarise(mod_agg_expend = mean(monthly_expenditures, na.rm = TRUE),
		  mod_agg_kero_expend = mean(kerosene_expend, na.rm = TRUE),
		  mod_agg_adults = mean(total_adults, na.rm = TRUE),
		  mod_agg_children =  mean(total_children, na.rm = TRUE),
		  mod_agg_sched_caste = mean(scheduled_caste, na.rm = TRUE),
		  mod_agg_sched_tribe =  mean(scheduled_tribe, na.rm = TRUE),
		  mod_agg_other_bw =  mean(other_bw, na.rm = TRUE),
		  mod_agg_general_caste =  mean(general_caste, na.rm = TRUE),
		  mod_agg_other_caste =  mean(other_caste, na.rm = TRUE),
		  mod_agg_no_caste =  mean(no_caste, na.rm = TRUE),
		  mod_agg_bw =  mean(bw, na.rm = TRUE),
		  mod_agg_educ_nf =  mean(educ_nf, na.rm = TRUE),
		  mod_agg_educ_5 =  mean(educ_5, na.rm = TRUE),
		  mod_agg_educ_10 =  mean(educ_10, na.rm = TRUE),
		  mod_agg_educ_12 =  mean(educ_12, na.rm = TRUE),
		  mod_agg_educ_grad =  mean(educ_grad, na.rm = TRUE),
		  mod_agg_educ_10p =  mean(educ_10p, na.rm = TRUE))


# Using Village Data
village_summ <-  vg %>% select(village_code, state, 
			       vill_educ = v_q11_edu,
			       vill_caste = v_q13_caste,
			       vill_expenditures = v_q16_monthly_spend,
			       vill_pop = village_pop,
			       vill_total_hh = total_households,
			       vill_power_hh = power_households)

# Using Census Data
census_summ <- relevant_census_data %>% 
	filter(village_code %in% vg$village_code) %>%
	select(village_code,
	       cens_distance_town = distance_nearest_town,
	       cens_hh_meas = census_household_measurement,
	       cens_population = census_population)


# Using Hamlet Data
hamlet_summ <- mod %>% group_by(hamlet) %>%
	dplyr::summarise(ham_adults = sum(total_adults, na.rm = TRUE),
		  ham_children = sum(total_children, na.rm = TRUE),
		  ham_pop = sum(total_adults + total_children, na.rm = TRUE),
		  ham_pop_mean = mean(total_adults + total_children, na.rm = TRUE),
		  ham_expend = mean(monthly_expenditures, na.rm = TRUE),
		  ham_kerosene_expend = mean(kerosene_expend, na.rm = TRUE),
		  ham_business = mean(business_activity, na.rm = TRUE),
		  ham_hh = n()) 

# 4) Combining Quality and willingness to pay, filtering out households that do not use grid


first_stage <- mod %>% 
	select(household_id, hamlet,
	       village_code, use_grid,
	       willingness_to_pay = pay_grid,
	       grid_interest,
	       unit_expend = monthly_expenditures,
	       unit_educ = educ,
	       unit_educ_10p = educ_10p,
	       unit_caste = gov_caste,
	       unit_bw = bw,
	       ngrid_unavailablen,
	       ngrid_conexpensiven,
	       ngrid_billexpensiven,
	       ngrid_unreliablen,
	       ngrid_dkn,
	       ngrid_othern,
	       kerosene_use,
	       kerosene_expend,
	       kerosene_pds,
	       pr_kerosene_pds,
	       kerosene_mkt,
	       pr_kerosene_mkt,
	       sum_adults_children) %>%
	inner_join(mod_summ) %>%
	inner_join(hamlet_summ) %>% 
	inner_join(census_summ) %>% 
	inner_join(village_summ) %>%
	inner_join(quality_variables) %>%
	mutate(willingness_to_pay = 
	       ifelse(grid_interest == "No",
		      0, willingness_to_pay)) 

second_stage <- first_stage %>% filter(use_grid == "No", !is.na(willingness_to_pay)) 

# Note: a csv of second_stage with "NA" values replaced with blank cells
# (first_stage.csv) is used for selectionModel.do. 


############################################################
# PART II: FIGURES 
############################################################
# Figure 1
cormatrix.df <- first_stage %>% 
	filter(use_grid == "No", !is.na(willingness_to_pay)) %>% 
	select("WTP" = willingness_to_pay,
	       "Available Hours" = EV_hrs_d, 
	       "Nighttime Hours" = EV_hrs_n, 
	       "Quality (total)" = EV_qual_d, 
	       "Quality (nighttime)" = EV_qual_n)
	
corr_plot <- cor(cormatrix.df)
pdf("Figure_1.pdf", height = 5)
corrplot::corrplot(corr_plot, method = "number", type = "lower")
dev.off()

# Figure 2 --- see "selectionModel.do" 
# Figure 3 --- see "selectionModel.do"

# Figure A1
dv_distribution_wtp_nt <- ggplot(second_stage %>% 
				  filter(willingness_to_pay <2500), 
			  aes(willingness_to_pay)) + 
geom_histogram(bins = 20) + 
xlab("Willingness to Pay") + 
ylab("Number of Households") + 
theme_tufte() + 
theme(axis.text.x = element_text(size = 16),
      axis.title.x = element_text(size = 18),
      axis.text.y = element_text(size = 16),
      axis.title.y = element_text(size = 18))

second_stage <- second_stage %>% 
	mutate(wtp_log = log(willingness_to_pay+1))

dv_distribution_wtp_log_nt <- ggplot(second_stage, aes(wtp_log)) +
	geom_histogram(bins = 50) + xlab("Willingness to Pay (ln + 1)") + 
	ylab("Number of Households") + 
	theme_tufte() + 
	theme(axis.text.x = element_text(size = 16),
	      axis.title.x = element_text(size = 18),
	      axis.text.y = element_text(size = 16),
	      axis.title.y = element_text(size = 18))

pdf("Figure_A1_1.pdf", height = 5)
dv_distribution_wtp_nt
dev.off()


pdf("Figure_A1_2.pdf", height = 5)
dv_distribution_wtp_log_nt
dev.off()

# Figure A2
varnames = c("Available Hours", "Nighttime Hours")
color_scheme = c("black", "gray52")
size_scheme = c(0.3, 0.9)
shape_scheme = c(20, 4)

hours_df <- second_stage %>% 
	select(household_id, willingness_to_pay, EV_hrs_n, EV_hrs_d) %>% 
	gather(variable, value, -c(1:2)) %>%
	select(-household_id) %>% 
	mutate(variable = ifelse(variable == "EV_hrs_d", 
				 varnames[1], varnames[2])) 

hours_df_log <- second_stage %>% 
	select(household_id, wtp_log, EV_hrs_n, EV_hrs_d) %>% 
	gather(variable, value, -c(1:2)) %>% 
	select(-household_id) %>% 
	mutate(variable = ifelse(variable == "EV_hrs_d", 
				 varnames[1], varnames[2])) 


ev_distribution_hrs_day_nt <- ggplot(second_stage %>% 
				      select(EV_hrs_d),
				      aes(EV_hrs_d)) + 
geom_histogram(bins = 50) + 
xlab("Available Hours") + 
ylab("Number of Households") + 
theme_tufte()  +
theme(axis.text.x = element_text(size = 16),
      axis.title.x = element_text(size = 18),
      axis.text.y = element_text(size = 16),
      axis.title.y = element_text(size = 18))

pdf("Figure_A2.pdf", width = 7)
ev_distribution_hrs_day_nt
dev.off()

# Figure A3
ev_distribution_quality_nt <- second_stage %>% 
	select(EV_qual_d) %>% 
	ggplot(aes(EV_qual_d)) + 
	geom_histogram(bins = 50) +
	xlab("Quality Index") + 
	ylab("Number of Households") + 
	theme_tufte() +
theme(axis.text.x = element_text(size = 16),
      axis.title.x = element_text(size = 18),
      axis.text.y = element_text(size = 16),
      axis.title.y = element_text(size = 18))

pdf("Figure_A3.pdf", width = 7)
ev_distribution_quality_nt
dev.off()

# Figure A4
ev_distribution_hrs_night_nt <- ggplot(second_stage %>% 
				      select(EV_hrs_n),
				      aes(EV_hrs_n)) + 
geom_histogram(bins = 50) + 
xlab("Nighttime Hours") + 
ylab("Number of Households") + 
theme_tufte()  +
theme(axis.text.x = element_text(size = 16),
      axis.title.x = element_text(size = 18),
      axis.text.y = element_text(size = 16),
      axis.title.y = element_text(size = 18))

pdf("Figure_A4.pdf", width = 7)
ev_distribution_hrs_night_nt
dev.off()

# Figure A5
ev_dv_hours_d_nt <- ggplot(hours_df %>% filter(variable == varnames[1]), 
		     aes(x = willingness_to_pay, 
			 y = value)) + 
geom_point(color = color_scheme[1], shape = shape_scheme[1], size = size_scheme[1]) + 
xlab("Willingness to Pay") + 
ylab("Available Hours") + theme_tufte() +
theme(legend.position="bottom") + 
geom_smooth(method = "lm", size = 0.5) +
theme(axis.text.x = element_text(size = 10),
      axis.title.x = element_text(size = 12),
      axis.text.y = element_text(size = 10),
      axis.title.y = element_text(size = 12))


ev_dv_hours_d_nt_log <- ggplot(hours_df_log %>% 
			       filter(variable == varnames[1]), 
		     aes(x = wtp_log, 
			 y = value)) + 
geom_point(color = color_scheme[1], 
	   shape = shape_scheme[1], 
	   size = size_scheme[1]) + 
xlab("Willingness to Pay (ln + 1)") + 
ylab("Available Hours") + theme_tufte() +
theme(legend.position="bottom") + 
geom_smooth(method = "lm", size = 0.5) +
theme(axis.text.x = element_text(size = 10),
      axis.title.x = element_text(size = 12),
      axis.text.y = element_text(size = 10),
      axis.title.y = element_text(size = 12))

pdf("Figure_A5_1.pdf", height = 3)
ev_dv_hours_d_nt
dev.off()

pdf("Figure_A5_2.pdf", height = 3)
ev_dv_hours_d_nt_log
dev.off()


# Figure A6
ev_dv_qual_nt <- ggplot(second_stage %>% 
			select(willingness_to_pay, EV_qual_d), 
		     aes(x = willingness_to_pay, 
			 y = EV_qual_d)) + 
geom_point(color = "black", shape = 16, size = 0.5) + 
xlab("Willingness to Pay") + 
ylab("Quality Index") + 
theme_tufte() + geom_smooth(method = "lm", size = 0.5) +
theme(axis.text.x = element_text(size = 10),
      axis.title.x = element_text(size = 12),
      axis.text.y = element_text(size = 10),
      axis.title.y = element_text(size = 12))


ev_dv_qual_nt_log <- ggplot(second_stage %>% 
			    select(wtp_log, EV_qual_d), 
		     aes(x = wtp_log, 
			 y = EV_qual_d)) + 
geom_point(color = "black", shape = 16, size = 0.5) + 
xlab("Willingness to Pay (ln + 1)") + 
ylab("Quality Index") +
theme_tufte() + geom_smooth(method = "lm", size = 0.5) +
theme(axis.text.x = element_text(size = 10),
      axis.title.x = element_text(size = 12),
      axis.text.y = element_text(size = 10),
      axis.title.y = element_text(size = 12))

pdf("Figure_A6_1.pdf", height = 3)
ev_dv_qual_nt
dev.off()

pdf("Figure_A6_2.pdf", height = 3)
ev_dv_qual_nt_log
dev.off()



# Figure A7
ev_dv_hours_n_nt <- ggplot(hours_df %>% filter(variable == varnames[2]), 
		     aes(x = willingness_to_pay, 
			 y = value)) + 
geom_point(color = color_scheme[1], shape = shape_scheme[1], size = size_scheme[1]) + 
xlab("Willingness to Pay") + 
ylab("Nighttime Hours") + 
theme_tufte() +
theme(legend.position="bottom") + 
geom_smooth(method = "lm", size = 0.5) +
theme(axis.text.x = element_text(size = 10),
      axis.title.x = element_text(size = 12),
      axis.text.y = element_text(size = 10),
      axis.title.y = element_text(size = 12))



ev_dv_hours_n_nt_log <- ggplot(hours_df_log %>% 
			       filter(variable == varnames[2]), 
		     aes(x = wtp_log, 
			 y = value)) + 
geom_point(color = color_scheme[1], 
	   shape = shape_scheme[1], 
	   size = size_scheme[1]) + 
xlab("Willingness to Pay (ln + 1)") + 
ylab("Nighttime Hours") + 
theme_tufte() +
theme(legend.position="bottom") + 
geom_smooth(method = "lm", size = 0.5) +
theme(axis.text.x = element_text(size = 10),
      axis.title.x = element_text(size = 12),
      axis.text.y = element_text(size = 10),
      axis.title.y = element_text(size = 12))

pdf("Figure_A7_1.pdf", height = 3)
ev_dv_hours_n_nt
dev.off()

pdf("Figure_A7_2.pdf", height = 3)
ev_dv_hours_n_nt_log
dev.off()


# Figure A8
hamlet_data <- second_stage %>% 
	select(household_id, 
	       hamlet, 
	       ham_pop, 
	       ham_hh, 
	       ham_pop_mean) %>% 
gather("variable", "value", -c(1:2))


fill_scheme = c("gray12", "gray0", "gray72")

iv_distribution_nt <- ggplot(hamlet_data, aes(value)) +
	geom_density(aes(value, fill = variable), 
		     position = "jitter", alpha = 0.5) + 
xlab("Value (ln)") + 
ylab("Density") + 
theme_tufte() + 
scale_x_continuous(trans = "log", labels = trans_format("log", waiver())) +
scale_fill_manual(name = "Variable", 
		  values = fill_scheme, 
		  labels = c("Habitation Population", 
			     "Habitation Households", 
			     "Mean Habitation Household Size"))  +
theme(axis.text.x = element_text(size = 12),
      axis.title.x = element_text(size = 14),
      axis.text.y = element_text(size = 12),
      axis.title.y = element_text(size = 14),
      legend.position = "bottom")

pdf("Figure_A8.pdf", height = 5)
iv_distribution_nt
dev.off()


# Figure A9
hamlet_data_exp <- second_stage %>% select(household_id, ham_expend, ham_kerosene_expend)

iv_distribution_exp_nt <- ggplot(hamlet_data_exp, aes(ham_expend)) + 
	geom_density(fill = "gray0", alpha = 0.5) + 
	xlab("Value (ln)")+ ylab("Density") + 
	theme_tufte() + 
	scale_x_continuous(trans = "log", 
			   labels = trans_format("log", waiver()))  +
theme(axis.text.x = element_text(size = 12),
      axis.title.x = element_text(size = 14),
      axis.text.y = element_text(size = 12),
      axis.title.y = element_text(size = 14))

pdf("Figure_A9.pdf", height = 5)
iv_distribution_exp_nt
dev.off()

# Figure A10
iv_distribution_kerosene_exp_nt <- ggplot(hamlet_data_exp, aes(ham_kerosene_expend)) + 
	geom_density(fill = "gray12", alpha = 0.5) + 
	xlab("Value (ln)")+ ylab("Density") + 
	theme_tufte() + 
	scale_x_continuous(trans = "log", 
			   labels = trans_format("log", waiver()))  +
theme(axis.text.x = element_text(size = 12),
      axis.title.x = element_text(size = 14),
      axis.text.y = element_text(size = 12),
      axis.title.y = element_text(size = 14))

pdf("Figure_A10.pdf", height = 5)
iv_distribution_kerosene_exp_nt
dev.off()

# Figure A11 reasonNoConnect.pdf

############################################################
# PART III: TABLES
############################################################
# Table 1
# Characteristics of selected v. unselected villages
# We use "mod" because we want to characterize statistics for 
# all households (elec and non-elec, mixed use and not)  

village_comparison <-  mod %>%
	group_by(village_code) %>% 
	dplyr::summarise(total_use_grid = mean(use_grid == "Yes"),
		  avg_size_hh_sampled = mean(sum_adults_children),
		  samp_prop_bw = mean(bw),
		  samp_mean_educ_10p = mean(educ_10p),
		  samp_mean_expenditures = mean(monthly_expenditures),
		  samp_mean_kero_expenditures = mean(kerosene_expend)) %>% 
	inner_join(relevant_census_data) %>%
	mutate(all_use = ifelse(total_use_grid == 1, 1, 0),
	       no_use = ifelse(total_use_grid == 0, 1, 0),
	       retained = ifelse(all_use + no_use == 0, 1, 0),
	       type = ifelse(all_use == 1, "All Use", ifelse(no_use == 1, "No Use", "Mixed Use"))) %>%
	ungroup() %>% group_by(type) %>% 
	dplyr::summarise(n(),
		  mean(total_use_grid), 
		  mean(avg_size_hh_sampled),
		  mean(samp_prop_bw),
		  mean(samp_mean_educ_10p),
		  mean(samp_mean_expenditures),
		  mean(log(samp_mean_expenditures)),
		  mean(samp_mean_kero_expenditures),
		  mean(log(samp_mean_kero_expenditures + 1)),
		  mean(census_household_measurement),
		  mean(log(census_household_measurement)),
		  mean(distance_nearest_town),
		  mean(log(distance_nearest_town+1)),
		  mean(census_population),
		  mean(log(census_population)))

summary_village_chart <- village_comparison %>% 
	filter(!(is.na(type))) %>% 
	gather(key = "variable", value = "value", -type) %>%
	spread(type, value) %>% 
	mutate_at(c(2:4), .funs = round, digits = 2) %>%
	slice(c(15, 14, 1, 13, 10, 11, 8, 12, 9, 2, 5, 4, 7, 3, 6)) %>%
	mutate(variable = c("Villages", 
			    "Use Grid",
			    "Sampled Household Size",
			    "Backwards Caste",
			    "10th Year or Greater Education",
			    "Household Expenditure",
			    "Household Expenditure (ln)",
			    "Kerosene Expenditure",
			    "Kerosene Expenditure (ln + 1)",
			    "Households",
			    "Households (ln)",
			    "Distance to Town",
			    "Distance to Town (ln + 1)",
			    "Village Population",
			    "Village Population (ln)")) %>%
	dplyr::rename(Description = variable)

x1 <- xtable(summary_village_chart, auto = TRUE)

print.xtable(x1, 
	     type = "latex", 
	     file = "Table_1.tex", 
	     floating = FALSE, 
	     include.rownames=FALSE,
	     format.args = list(big.mark = ","))


# Table 2
# Summary within villages

first_stage <- first_stage %>% dplyr::rename(unit_kerosene_expend = kerosene_expend)
second_stage <- second_stage %>% dplyr::rename(unit_kerosene_expend = kerosene_expend)

grid_comparison_1 <- first_stage %>% 
	filter(!(use_grid == "No" & is.na(willingness_to_pay) == TRUE)) %>%
	group_by(use_grid) %>% 
	dplyr::summarise(n(),
		  log(n()),
		  mean(ham_hh),
		  mean(log(ham_hh)),
		  mean(unit_expend),
		  mean(log(unit_expend)),
		  mean(unit_kerosene_expend),
		  mean(log(unit_kerosene_expend + 1)),
		  mean(unit_educ_10p),
		  mean(unit_bw),
		  mean(EV_hrs_d),
		  mean(EV_qual_d),
		  mean(EV_hrs_n))

summary_grid_chart <- grid_comparison_1 %>% 
	gather(key = "variable", value = "value", -use_grid) %>%
	spread(use_grid, value) %>% 
	mutate_at(c(2:3), .funs = round, digits = 2) %>%
	slice(c(13, 1, 5, 6, 11, 7, 12, 8, 10, 9, 2, 4, 3)) %>%
	mutate(variable = c("Households",
			    "Households (ln)",
			    "Habitation Households",
			    "Habitation Households (ln)",
			    "Household Expenditure",
			    "Household Expenditure (ln)",
			    "Kerosene Expenditure",
			    "Kerosene Expenditure (ln + 1)",
			    "10th Year or Greater Education",
			    "Backwards Caste",
			    "Available Hours",
			    "Quality Index",
			    "Nighttime Hours")) %>%
	dplyr::rename(Description = variable, 
		 NotElectrified = No, 
		 Electrified = Yes)

summary_village_chart <- summary_village_chart %>% select(Description, 'Mixed Use', 'All Use', 'No Use')

x2 <- xtable(summary_grid_chart, auto = TRUE)
names(x2) <- c("Description", "Not Electrified", "Electrified")

print.xtable(x2, 
	     type = "latex", 
	     file = "Table_2.tex", 
	     floating = FALSE, 
	     include.rownames=FALSE,
	     format.args = list(big.mark = ","))

# Table 3
# Summary sample statistics
summary_statistics_village <- first_stage %>% 
	filter(!(use_grid == "No" & is.na(willingness_to_pay) == TRUE)) %>%
	mutate(vill_hh = vill_pop/vill_total_hh,
	       vill_prop = vill_power_hh/vill_total_hh) %>%
	select(village_code, willingness_to_pay,
		       starts_with("EV"),
		       starts_with("CTRL_"),
		       starts_with("ham_"),
		       starts_with("mod_agg_"),
		       starts_with("cens_"),
		       vill_pop,
		       vill_total_hh,
		       vill_prop,
		       vill_hh) %>% 
group_by(village_code) %>% 
dplyr::summarise_all(funs(mean(., na.rm = TRUE))) %>% ungroup() %>%
dplyr::summarise_all(funs(avg = mean(., na.rm = TRUE), 
			  low = min(., na.rm = TRUE), 
			  hig = max(., na.rm = TRUE), 
			  std = sd(., na.rm = TRUE))) %>% 
gather(key = "variable", value = "value") %>% 
filter(str_sub(variable, 1,12) != "village_code") %>%
mutate(stat_type = ifelse(grepl("avg$", variable), "Average",
		     ifelse(grepl("low$", variable), "Minimum",
			    ifelse(grepl("hig$", variable), "Maximum", "Standard Deviation")))) %>%
mutate(var_type =  str_sub(variable, 1, -5)) %>% 
select(-variable) %>% spread(stat_type, value) %>%
select(Variable = var_type, Minimum, Average, Maximum, "Standard Deviation") %>% 
slice(c(38, 39, 36, 35, 37, 6:9, 14, 16, 1, 4, 27, 29, 22, 19)) %>%
mutate(Variable = c("Households",
		    "Willingness to Pay",
		    "Village Population",
		    "Household Size",
		    "Use Grid (Households)",
		    "Available Hours",
		    "Nighttime Hours",
		    "Quality (Available Hours)",
		    "Quality (Nighttime Hours)",
		    "Habitation Households",
		    "Habitation Population",
		    "Distance to Town",
		    "Electricity Costs",
		    "Household Expenditure (Village Averages)",
		    "Kerosene Expenditure (Villages Averages)",
		    "10th Year or Greater Education",
		    "Backwards Caste"))
 


summary_statistics_ind <-  second_stage %>% 
	select(willingness_to_pay, unit_expend, 
	       unit_kerosene_expend, sum_adults_children) %>%
	dplyr::summarise_all(funs(avg = mean(., na.rm = TRUE), 
			  low = min(., na.rm = TRUE), 
			  hig = max(., na.rm = TRUE), 
			  std = sd(., na.rm = TRUE))) %>% gather(key = "variable") %>% 
mutate(stat_type = ifelse(grepl("avg$", variable), "Average",
		     ifelse(grepl("low$", variable), "Minimum",
			    ifelse(grepl("hig$", variable), "Maximum", "Standard Deviation")))) %>%
mutate(var_type = str_sub(variable, 1, -5)) %>% 
select(-variable) %>% 
spread(stat_type, value) %>% 
select(Variable = var_type, Minimum, Average, Maximum, "Standard Deviation") %>% 
slice(c(4, 2, 3, 1)) %>% 
mutate(Variable = c("Willingess to Pay (Household-Level)",
		    "Household Expenditure (Household-Level)",
		    "Kerosene Expenditure (Household-Level)",
		    "Household Size (Household-Level)"))

summary_statistics <- rbind(summary_statistics_village, summary_statistics_ind)

x3 <- xtable(summary_statistics, auto = TRUE, size = footnote, digits = 2)

print.xtable(x3, type = "latex", 
	     file = "Table_3.tex",
	     floating = FALSE, 
	     include.rownames = FALSE, 
	     hline.after = c(-1, 0, 17, 21), 
	     booktabs = TRUE,
	     format.args = list(big.mark =","),
	     digits = 2)


# Table 4: regmodel.tex --- see "selectionModel.do"
# Table 5: selectmodel.tex --- see "selectionModel.do"
# Table A1 selectmodelBus.tex--- see "selectionModel.do"

